Barriers in Healthcare Utilization (VSURF)

Author

Miguel Fudolig

library(tidyverse)
library(ggplot2)
library(lavaan)
library(car)
library(caret)
library(ggRandomForests)
library(VSURF)
library(glmnet)
library(Boruta)
library(doParallel)

Data set

This data set is from the 2015 Asian American Quality of Life survey. Participants are from Austin, Texas.

Input data set

# Read the AAQoL survey export and prepare its categorical columns:
#   1. every character column becomes a factor;
#   2. selected factors get an explicit reference (first) level;
#   3. the income brackets are collapsed into a two-level Below/Above factor
#      (anything not listed below ends up as NA);
#   4. the Likert-type items are re-created with their levels in a fixed order.
qol <- read_csv("AAQoL.csv") |>
  mutate(across(where(is.character), as.factor)) |>
  mutate(
    `English Difficulties` = relevel(`English Difficulties`, ref = "Not at all"),
    `English Speaking` = relevel(`English Speaking`, ref = "Not at all"),
    Ethnicity = relevel(Ethnicity, ref = "Chinese"),
    Religion = relevel(Religion, ref = "None")
  ) |>
  mutate(
    Income_median = factor(
      case_match(
        Income,
        c(
          "$0 - $9,999",
          "$10,000 - $19,999",
          "$20,000 - $29,999",
          "$30,000 - $39,999",
          "$40,000 - $49,999",
          "$50,000 - $59,999"
        ) ~ "Below",
        c(
          "$60,000 - $69,999",
          "$70,000 and over"
        ) ~ "Above",
        .default = Income
      ),
      levels = c("Below", "Above")
    )
  ) |>
  mutate(
    across(
      `Familiarity with America`:`Familiarity with Ethnic Origin`,
      \(item) factor(item, levels = c("Very low", "Low", "High", "Very high"))
    ),
    `Identify Ethnically` = factor(
      `Identify Ethnically`,
      levels = c("Not at all", "Not very close", "Somewhat close", "Very close")
    ),
    Belonging = factor(
      Belonging,
      levels = c("Not at all", "Not very much", "Somewhat", "Very much")
    ),
    `Primary Language` = as.factor(`Primary Language`)
  )
New names:
Rows: 2609 Columns: 231
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(190): Gender, Ethnicity, Marital Status, No One, Spouse, Children, Gran... dbl
(41): Survey ID, Age, Education Completed, Household Size, Grandparent,...
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `Other` -> `Other...17`
• `Other` -> `Other...89`
# Render the prepared survey data as an interactive table.
DT::datatable(qol)
Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html

Unmet Health Need

# Unmet Health Need: build the modelling frame and run VSURF.
# The outcome is kept as the FIRST column; later chunks rely on that when they
# map VSURF's predictor indices back to names via names(rfdata[,-1]).
rfdata <- qol |>
  select(
    `Unmet Health Need`, Ethnicity, Age, Gender, Religion,
    `Full Time Employment`, Income_median,
    `US Born`:`Discrimination`,
    `Health Insurance`, `Dental Insurance`
  ) |>
  na.omit() |>                 # complete cases only
  as.data.frame() |>
  rename_with(make.names)      # syntactic names for the formula interface

# Resample with ROSE (fixed seed) and keep only the generated data frame.
imbal <- ROSE::ROSE(Unmet.Health.Need ~ .,
                    data = rfdata,
                    seed = 3)$data

# Variable selection on the resampled data. TRUE/FALSE are spelled out because
# the T/F shorthands are ordinary variables that can be reassigned.
vsurf.mod <- VSURF(Unmet.Health.Need ~ ., imbal,
                   na.action = "na.omit", parallel = TRUE, verbose = FALSE)
Warning in VSURF.formula(Unmet.Health.Need ~ ., imbal, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
# Timing and number of variables kept at each VSURF step.
summary(vsurf.mod)

 VSURF computation time: 23.3 secs 

 VSURF selected: 
    18 variables at thresholding step (in 5 secs)
    15 variables at interpretation step (in 2.9 secs)
    14 variables at prediction step (in 15.4 secs)

 VSURF ran in parallel on a PSOCK cluster and used 15 cores 
# Names of the predictors retained at the prediction step.
names(rfdata)[-1][vsurf.mod$varselect.pred]
 [1] "English.Speaking"               "Dental.Insurance"              
 [3] "Religion"                       "Discrimination"                
 [5] "Ethnicity"                      "English.Difficulties"          
 [7] "Income_median"                  "Belonging"                     
 [9] "Familiarity.with.America"       "Identify.Ethnically"           
[11] "Age"                            "Familiarity.with.Ethnic.Origin"
[13] "Gender"                         "Full.Time.Employment"          
# Names of the predictors retained at the interpretation step.
names(rfdata)[-1][vsurf.mod$varselect.interp]
 [1] "English.Speaking"               "Dental.Insurance"              
 [3] "Religion"                       "Discrimination"                
 [5] "Ethnicity"                      "English.Difficulties"          
 [7] "Income_median"                  "Belonging"                     
 [9] "Familiarity.with.America"       "Identify.Ethnically"           
[11] "Age"                            "Familiarity.with.Ethnic.Origin"
[13] "Duration.of.Residency"          "Gender"                        
[15] "Full.Time.Employment"          
# Built-in VSURF diagnostic plots.
plot(vsurf.mod)

# The fit's mean.perf component (see the VSURF documentation for its definition).
vsurf.mod[["mean.perf"]]
[1] 0.06919959

Importance

# Importance table: predictor names (outcome column dropped) in VSURF's
# importance order, with the matching mean and SD of importance.
vi <- data.frame(
  Variable = names(rfdata)[-1][vsurf.mod$imp.mean.dec.ind],
  Importance = vsurf.mod$imp.mean.dec,
  sd_Importance = vsurf.mod$imp.sd.dec
)
# Colour tag used by the plot: Ethnicity is highlighted, the rest are black.
vi <- mutate(vi, fill = if_else(Variable %in% "Ethnicity", "red", "black"))

# Show the table with both numeric columns rounded to 5 decimals.
mutate(vi, across(c(Importance, sd_Importance), \(v) round(v, 5)))
                         Variable Importance sd_Importance  fill
1                English.Speaking    0.10924       0.00212 black
2                Dental.Insurance    0.08979       0.00189 black
3                        Religion    0.08374       0.00083 black
4                  Discrimination    0.07910       0.00122 black
5                       Ethnicity    0.07475       0.00139   red
6            English.Difficulties    0.06901       0.00136 black
7                   Income_median    0.05219       0.00187 black
8                       Belonging    0.05137       0.00077 black
9        Familiarity.with.America    0.03975       0.00081 black
10            Identify.Ethnically    0.03611       0.00081 black
11                            Age    0.03282       0.00108 black
12 Familiarity.with.Ethnic.Origin    0.03249       0.00082 black
13          Duration.of.Residency    0.02983       0.00079 black
14                         Gender    0.02518       0.00084 black
15           Full.Time.Employment    0.02381       0.00081 black
16               Primary.Language    0.01325       0.00053 black
17               Health.Insurance    0.01282       0.00050 black
18                        US.Born    0.00268       0.00014 black
# Bar chart of mean importance (bars ordered by importance) with error bars of
# +/- one sd_Importance; Ethnicity is highlighted through the `fill` column.
importance_plot <- ggplot(
  vi,
  aes(x = reorder(Variable, Importance), y = Importance, fill = fill)
) +
  geom_col(alpha = 0.4) +
  geom_errorbar(aes(ymin = Importance - sd_Importance,
                    ymax = Importance + sd_Importance)) +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  # Named values: the colour mapping no longer depends on the alphabetical
  # order of the values found in `fill`.
  scale_fill_manual(values = c(black = "black", red = "red"),
                    guide = "none")

print(importance_plot)

# Save this plot object explicitly instead of relying on last_plot().
ggsave(filename = "VSURF_importance_unmethealth.png", plot = importance_plot,
       width = 12, height = 8, units = "in")

Unmet Dental Needs

# Unmet Dental Needs: build the modelling frame and run VSURF.
# The outcome is kept as the FIRST column; later chunks rely on that when they
# map VSURF's predictor indices back to names via names(rfdata[,-1]).
rfdata <- qol |>
  select(
    `Unmet Dental Needs`, Ethnicity, Age, Gender, Religion,
    `Full Time Employment`, Income_median,
    `US Born`:`Discrimination`,
    `Health Insurance`, `Dental Insurance`
  ) |>
  na.omit() |>                 # complete cases only
  as.data.frame() |>
  rename_with(make.names)      # syntactic names for the formula interface

# Resample with ROSE (fixed seed) and keep only the generated data frame.
imbal <- ROSE::ROSE(Unmet.Dental.Needs ~ .,
                    data = rfdata,
                    seed = 3)$data

# Variable selection on the resampled data. TRUE/FALSE are spelled out because
# the T/F shorthands are ordinary variables that can be reassigned.
vsurf.mod <- VSURF(Unmet.Dental.Needs ~ ., imbal,
                   na.action = "na.omit", parallel = TRUE, verbose = FALSE)
Warning in VSURF.formula(Unmet.Dental.Needs ~ ., imbal, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
# Timing and number of variables kept at each VSURF step.
summary(vsurf.mod)

 VSURF computation time: 16.6 secs 

 VSURF selected: 
    18 variables at thresholding step (in 5.1 secs)
    17 variables at interpretation step (in 3 secs)
    1 variables at prediction step (in 8.6 secs)

 VSURF ran in parallel on a PSOCK cluster and used 15 cores 
# Names of the predictors retained at the prediction step.
names(rfdata)[-1][vsurf.mod$varselect.pred]
[1] "Dental.Insurance"
# Names of the predictors retained at the interpretation step.
names(rfdata)[-1][vsurf.mod$varselect.interp]
 [1] "Dental.Insurance"               "Religion"                      
 [3] "Ethnicity"                      "English.Speaking"              
 [5] "Income_median"                  "Familiarity.with.America"      
 [7] "English.Difficulties"           "Belonging"                     
 [9] "Discrimination"                 "Age"                           
[11] "Identify.Ethnically"            "Duration.of.Residency"         
[13] "Familiarity.with.Ethnic.Origin" "Health.Insurance"              
[15] "Gender"                         "Primary.Language"              
[17] "Full.Time.Employment"          
# Built-in VSURF diagnostic plots.
plot(vsurf.mod)

# The fit's mean.perf component (see the VSURF documentation for its definition).
vsurf.mod[["mean.perf"]]
[1] 0.09077744

Importance

# Importance table: predictor names (outcome column dropped) in VSURF's
# importance order, with the matching mean and SD of importance.
vi <- data.frame(
  Variable = names(rfdata)[-1][vsurf.mod$imp.mean.dec.ind],
  Importance = vsurf.mod$imp.mean.dec,
  sd_Importance = vsurf.mod$imp.sd.dec
)
# Colour tag used by the plot: Ethnicity is highlighted, the rest are black.
vi <- mutate(vi, fill = if_else(Variable %in% "Ethnicity", "red", "black"))

# Show the table with both numeric columns rounded to 5 decimals.
mutate(vi, across(c(Importance, sd_Importance), \(v) round(v, 5)))
                         Variable Importance sd_Importance  fill
1                Dental.Insurance    0.10495       0.00201 black
2                        Religion    0.07101       0.00098 black
3                       Ethnicity    0.06735       0.00116   red
4                English.Speaking    0.06427       0.00153 black
5                   Income_median    0.05603       0.00170 black
6        Familiarity.with.America    0.05383       0.00091 black
7            English.Difficulties    0.05107       0.00106 black
8                       Belonging    0.04901       0.00111 black
9                  Discrimination    0.04897       0.00125 black
10                            Age    0.03837       0.00072 black
11            Identify.Ethnically    0.03301       0.00101 black
12          Duration.of.Residency    0.03009       0.00052 black
13 Familiarity.with.Ethnic.Origin    0.02884       0.00074 black
14               Health.Insurance    0.02815       0.00098 black
15                         Gender    0.02530       0.00073 black
16               Primary.Language    0.02197       0.00058 black
17           Full.Time.Employment    0.01837       0.00046 black
18                        US.Born    0.00235       0.00018 black
# Bar chart of mean importance (bars ordered by importance) with error bars of
# +/- one sd_Importance; Ethnicity is highlighted through the `fill` column.
importance_plot <- ggplot(
  vi,
  aes(x = reorder(Variable, Importance), y = Importance, fill = fill)
) +
  geom_col(alpha = 0.4) +
  geom_errorbar(aes(ymin = Importance - sd_Importance,
                    ymax = Importance + sd_Importance)) +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  # Named values: the colour mapping no longer depends on the alphabetical
  # order of the values found in `fill`.
  scale_fill_manual(values = c(black = "black", red = "red"),
                    guide = "none")

print(importance_plot)

# Save this plot object explicitly instead of relying on last_plot().
ggsave(filename = "VSURF_importance_unmetdental.png", plot = importance_plot,
       width = 12, height = 8, units = "in")

Physical Check-up

# Physical Check-up: build the modelling frame and run VSURF.
# The outcome is kept as the FIRST column; later chunks rely on that when they
# map VSURF's predictor indices back to names via names(rfdata[,-1]).
rfdata <- qol |>
  select(
    `Physical Check-up`, Ethnicity, Age, Gender, Religion,
    `Full Time Employment`, Income_median,
    `US Born`:`Discrimination`,
    `Health Insurance`, `Dental Insurance`
  ) |>                         # native pipe throughout (was a stray %>%)
  na.omit() |>                 # complete cases only
  rename(Employment = `Full Time Employment`,
         EnglishSpeak = `English Speaking`,
         EnglishDiff = `English Difficulties`) |>
  as.data.frame() |>
  rename_with(make.names)      # syntactic names for the formula interface

# Resample with ROSE (fixed seed) and keep only the generated data frame.
imbal <- ROSE::ROSE(Physical.Check.up ~ .,
                    data = rfdata,
                    seed = 3)$data

# Variable selection on the resampled data. TRUE/FALSE are spelled out because
# the T/F shorthands are ordinary variables that can be reassigned.
vsurf.mod <- VSURF(Physical.Check.up ~ ., imbal,
                   na.action = "na.omit", parallel = TRUE, verbose = FALSE)
Warning in VSURF.formula(Physical.Check.up ~ ., imbal, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
# VSURF(Physical.Check.up~.,rfdata,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod

# Timing and number of variables kept at each VSURF step.
summary(vsurf.mod)

 VSURF computation time: 24 secs 

 VSURF selected: 
    18 variables at thresholding step (in 5.7 secs)
    14 variables at interpretation step (in 3.3 secs)
    12 variables at prediction step (in 15 secs)

 VSURF ran in parallel on a PSOCK cluster and used 15 cores 
# Names of the predictors retained at the prediction step.
names(rfdata)[-1][vsurf.mod$varselect.pred]
 [1] "Duration.of.Residency"          "Dental.Insurance"              
 [3] "Ethnicity"                      "Health.Insurance"              
 [5] "Religion"                       "EnglishDiff"                   
 [7] "EnglishSpeak"                   "Familiarity.with.Ethnic.Origin"
 [9] "Belonging"                      "Familiarity.with.America"      
[11] "Identify.Ethnically"            "Gender"                        
# Names of the predictors retained at the interpretation step.
names(rfdata)[-1][vsurf.mod$varselect.interp]
 [1] "Duration.of.Residency"          "Dental.Insurance"              
 [3] "Ethnicity"                      "Health.Insurance"              
 [5] "Religion"                       "Age"                           
 [7] "EnglishDiff"                    "EnglishSpeak"                  
 [9] "Familiarity.with.Ethnic.Origin" "Belonging"                     
[11] "Familiarity.with.America"       "Identify.Ethnically"           
[13] "Income_median"                  "Gender"                        
# Built-in VSURF diagnostic plots.
plot(vsurf.mod)

# The fit's mean.perf component (see the VSURF documentation for its definition).
vsurf.mod[["mean.perf"]]
[1] 0.1762462

Importance

# Importance table: predictor names (outcome column dropped) in VSURF's
# importance order, with the matching mean and SD of importance.
vi <- data.frame(
  Variable = names(rfdata)[-1][vsurf.mod$imp.mean.dec.ind],
  Importance = vsurf.mod$imp.mean.dec,
  sd_Importance = vsurf.mod$imp.sd.dec
)
# Colour tag used by the plot: Ethnicity is highlighted, the rest are black.
vi <- mutate(vi, fill = if_else(Variable %in% "Ethnicity", "red", "black"))

# Show the table with both numeric columns rounded to 5 decimals.
mutate(vi, across(c(Importance, sd_Importance), \(v) round(v, 5)))
                         Variable Importance sd_Importance  fill
1           Duration.of.Residency    0.05958       0.00108 black
2                Dental.Insurance    0.05820       0.00115 black
3                       Ethnicity    0.04682       0.00101   red
4                Health.Insurance    0.03525       0.00079 black
5                        Religion    0.03338       0.00078 black
6                             Age    0.03318       0.00078 black
7                     EnglishDiff    0.03246       0.00075 black
8                    EnglishSpeak    0.02352       0.00060 black
9  Familiarity.with.Ethnic.Origin    0.02108       0.00049 black
10                      Belonging    0.02087       0.00047 black
11       Familiarity.with.America    0.02022       0.00040 black
12            Identify.Ethnically    0.01734       0.00059 black
13                  Income_median    0.01670       0.00097 black
14                         Gender    0.01657       0.00060 black
15                 Discrimination    0.01428       0.00057 black
16                     Employment    0.01149       0.00032 black
17               Primary.Language    0.01130       0.00061 black
18                        US.Born    0.00658       0.00025 black
# Bar chart of mean importance (bars ordered by importance) with error bars of
# +/- one sd_Importance; Ethnicity is highlighted through the `fill` column.
importance_plot <- ggplot(
  vi,
  aes(x = reorder(Variable, Importance), y = Importance, fill = fill)
) +
  geom_col(alpha = 0.4) +
  geom_errorbar(aes(ymin = Importance - sd_Importance,
                    ymax = Importance + sd_Importance)) +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  # Named values: the colour mapping no longer depends on the alphabetical
  # order of the values found in `fill`.
  scale_fill_manual(values = c(black = "black", red = "red"),
                    guide = "none")

print(importance_plot)

# Save this plot object explicitly instead of relying on last_plot().
ggsave(filename = "VSURF_importance_PC_ROSE.png", plot = importance_plot,
       width = 12, height = 8, units = "in")

Dental Check-up

# Dentist Check-up: build the modelling frame and run VSURF.
# The outcome is kept as the FIRST column; later chunks rely on that when they
# map VSURF's predictor indices back to names via names(rfdata[,-1]).
rfdata <- qol |>
  select(
    `Dentist Check-up`, Ethnicity, Age, Gender, Religion,
    `Full Time Employment`, Income_median,
    `US Born`:`Discrimination`,
    `Health Insurance`, `Dental Insurance`
  ) |>
  na.omit() |>                 # complete cases only
  as.data.frame() |>
  rename_with(make.names)      # syntactic names for the formula interface

# Resample with ROSE (fixed seed) and keep only the generated data frame.
imbal <- ROSE::ROSE(Dentist.Check.up ~ .,
                    data = rfdata,
                    seed = 3)$data

# Variable selection on the resampled data. TRUE/FALSE are spelled out because
# the T/F shorthands are ordinary variables that can be reassigned.
vsurf.mod <- VSURF(Dentist.Check.up ~ ., imbal,
                   na.action = "na.omit", parallel = TRUE, verbose = FALSE)
Warning in VSURF.formula(Dentist.Check.up ~ ., imbal, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
# VSURF(Dentist.Check.up~.,rfdata,na.action="na.omit",parallel=T,verbose=F)->vsurf.mod

# Timing and number of variables kept at each VSURF step.
summary(vsurf.mod)

 VSURF computation time: 18.9 secs 

 VSURF selected: 
    18 variables at thresholding step (in 5.2 secs)
    12 variables at interpretation step (in 3.1 secs)
    10 variables at prediction step (in 10.6 secs)

 VSURF ran in parallel on a PSOCK cluster and used 15 cores 
# Names of the predictors retained at the prediction step.
names(rfdata)[-1][vsurf.mod$varselect.pred]
 [1] "Dental.Insurance"               "Duration.of.Residency"         
 [3] "Ethnicity"                      "English.Difficulties"          
 [5] "Income_median"                  "English.Speaking"              
 [7] "Familiarity.with.Ethnic.Origin" "Identify.Ethnically"           
 [9] "Belonging"                      "Familiarity.with.America"      
# Names of the predictors retained at the interpretation step.
names(rfdata)[-1][vsurf.mod$varselect.interp]
 [1] "Dental.Insurance"               "Duration.of.Residency"         
 [3] "Ethnicity"                      "Religion"                      
 [5] "English.Difficulties"           "Income_median"                 
 [7] "English.Speaking"               "Familiarity.with.Ethnic.Origin"
 [9] "Age"                            "Identify.Ethnically"           
[11] "Belonging"                      "Familiarity.with.America"      
# Built-in VSURF diagnostic plots.
plot(vsurf.mod)

# The fit's mean.perf component (see the VSURF documentation for its definition).
vsurf.mod[["mean.perf"]]
[1] 0.1694799

Importance

# Importance table: predictor names (outcome column dropped) in VSURF's
# importance order, with the matching mean and SD of importance.
vi <- data.frame(
  Variable = names(rfdata)[-1][vsurf.mod$imp.mean.dec.ind],
  Importance = vsurf.mod$imp.mean.dec,
  sd_Importance = vsurf.mod$imp.sd.dec
)
# Colour tag used by the plot: Ethnicity is highlighted, the rest are black.
vi <- mutate(vi, fill = if_else(Variable %in% "Ethnicity", "red", "black"))

# Show the table with both numeric columns rounded to 5 decimals.
mutate(vi, across(c(Importance, sd_Importance), \(v) round(v, 5)))
                         Variable Importance sd_Importance  fill
1                Dental.Insurance    0.10815       0.00106 black
2           Duration.of.Residency    0.05527       0.00095 black
3                       Ethnicity    0.04367       0.00062   red
4                        Religion    0.03888       0.00067 black
5            English.Difficulties    0.02921       0.00065 black
6                   Income_median    0.02900       0.00076 black
7                English.Speaking    0.02480       0.00077 black
8  Familiarity.with.Ethnic.Origin    0.01971       0.00063 black
9                             Age    0.01905       0.00053 black
10            Identify.Ethnically    0.01773       0.00051 black
11                      Belonging    0.01660       0.00037 black
12       Familiarity.with.America    0.01609       0.00049 black
13                 Discrimination    0.01418       0.00066 black
14               Health.Insurance    0.01311       0.00059 black
15                         Gender    0.01139       0.00043 black
16           Full.Time.Employment    0.00931       0.00044 black
17               Primary.Language    0.00675       0.00043 black
18                        US.Born    0.00321       0.00025 black
# Bar chart of mean importance (bars ordered by importance) with error bars of
# +/- one sd_Importance; Ethnicity is highlighted through the `fill` column.
importance_plot <- ggplot(
  vi,
  aes(x = reorder(Variable, Importance), y = Importance, fill = fill)
) +
  geom_col(alpha = 0.4) +
  geom_errorbar(aes(ymin = Importance - sd_Importance,
                    ymax = Importance + sd_Importance)) +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  # Named values: the colour mapping no longer depends on the alphabetical
  # order of the values found in `fill`.
  scale_fill_manual(values = c(black = "black", red = "red"),
                    guide = "none")

print(importance_plot)

# Save this plot object explicitly instead of relying on last_plot().
ggsave(filename = "VSURF_importance_Dc_ROSE.png", plot = importance_plot,
       width = 12, height = 8, units = "in")

Folkmedicine

# Folkmedicine: build the modelling frame and run VSURF.
# The outcome is kept as the FIRST column; later chunks rely on that when they
# map VSURF's predictor indices back to names via names(rfdata[,-1]).
rfdata <- qol |>
  select(
    `Folkmedicine`, Ethnicity, Age, Gender, Religion,
    `Full Time Employment`, Income_median,
    `US Born`:`Discrimination`,
    `Health Insurance`, `Dental Insurance`
  ) |>
  na.omit() |>                 # complete cases only
  as.data.frame() |>
  rename_with(make.names)      # syntactic names for the formula interface

# Resample with ROSE (fixed seed) and keep only the generated data frame.
imbal <- ROSE::ROSE(Folkmedicine ~ .,
                    data = rfdata,
                    seed = 3)$data

# Variable selection on the resampled data. TRUE/FALSE are spelled out because
# the T/F shorthands are ordinary variables that can be reassigned.
vsurf.mod <- VSURF(Folkmedicine ~ ., imbal,
                   na.action = "na.omit", parallel = TRUE, verbose = FALSE)
Warning in VSURF.formula(Folkmedicine ~ ., imbal, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
# Timing and number of variables kept at each VSURF step.
summary(vsurf.mod)

 VSURF computation time: 27.2 secs 

 VSURF selected: 
    18 variables at thresholding step (in 5.3 secs)
    16 variables at interpretation step (in 3.2 secs)
    15 variables at prediction step (in 18.7 secs)

 VSURF ran in parallel on a PSOCK cluster and used 15 cores 
# Names of the predictors retained at the prediction step.
names(rfdata)[-1][vsurf.mod$varselect.pred]
 [1] "Ethnicity"                      "English.Speaking"              
 [3] "Religion"                       "Age"                           
 [5] "English.Difficulties"           "Duration.of.Residency"         
 [7] "Familiarity.with.America"       "Familiarity.with.Ethnic.Origin"
 [9] "Belonging"                      "Full.Time.Employment"          
[11] "Identify.Ethnically"            "Gender"                        
[13] "Income_median"                  "Primary.Language"              
[15] "Dental.Insurance"              
# Names of the predictors retained at the interpretation step.
names(rfdata)[-1][vsurf.mod$varselect.interp]
 [1] "Ethnicity"                      "English.Speaking"              
 [3] "Religion"                       "Age"                           
 [5] "English.Difficulties"           "Duration.of.Residency"         
 [7] "Familiarity.with.America"       "Familiarity.with.Ethnic.Origin"
 [9] "Belonging"                      "Full.Time.Employment"          
[11] "Discrimination"                 "Identify.Ethnically"           
[13] "Gender"                         "Income_median"                 
[15] "Primary.Language"               "Dental.Insurance"              
# Built-in VSURF diagnostic plots.
plot(vsurf.mod)

# The fit's mean.perf component (see the VSURF documentation for its definition).
vsurf.mod[["mean.perf"]]
[1] 0.1121726

Importance

# Importance table: predictor names (outcome column dropped) in VSURF's
# importance order, with the matching mean and SD of importance.
vi <- data.frame(
  Variable = names(rfdata)[-1][vsurf.mod$imp.mean.dec.ind],
  Importance = vsurf.mod$imp.mean.dec,
  sd_Importance = vsurf.mod$imp.sd.dec
)
# Colour tag used by the plot: Ethnicity is highlighted, the rest are black.
vi <- mutate(vi, fill = if_else(Variable %in% "Ethnicity", "red", "black"))

# Show the table with both numeric columns rounded to 5 decimals.
mutate(vi, across(c(Importance, sd_Importance), \(v) round(v, 5)))
                         Variable Importance sd_Importance  fill
1                       Ethnicity    0.08673       0.00126   red
2                English.Speaking    0.07109       0.00168 black
3                        Religion    0.06485       0.00114 black
4                             Age    0.06303       0.00135 black
5            English.Difficulties    0.05479       0.00148 black
6           Duration.of.Residency    0.04271       0.00105 black
7        Familiarity.with.America    0.03900       0.00095 black
8  Familiarity.with.Ethnic.Origin    0.03833       0.00082 black
9                       Belonging    0.03727       0.00086 black
10           Full.Time.Employment    0.03277       0.00130 black
11                 Discrimination    0.03116       0.00083 black
12            Identify.Ethnically    0.02997       0.00084 black
13                         Gender    0.02114       0.00075 black
14                  Income_median    0.01866       0.00074 black
15               Primary.Language    0.01770       0.00091 black
16               Dental.Insurance    0.01726       0.00043 black
17               Health.Insurance    0.00745       0.00032 black
18                        US.Born    0.00460       0.00027 black
# Bar chart of mean importance (bars ordered by importance) with error bars of
# +/- one sd_Importance; Ethnicity is highlighted through the `fill` column.
importance_plot <- ggplot(
  vi,
  aes(x = reorder(Variable, Importance), y = Importance, fill = fill)
) +
  geom_col(alpha = 0.4) +
  geom_errorbar(aes(ymin = Importance - sd_Importance,
                    ymax = Importance + sd_Importance)) +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  # Named values: the colour mapping no longer depends on the alphabetical
  # order of the values found in `fill`.
  scale_fill_manual(values = c(black = "black", red = "red"),
                    guide = "none")

print(importance_plot)

# Save this plot object explicitly instead of relying on last_plot().
ggsave(filename = "VSURF_importance_Alt_ROSE.png", plot = importance_plot,
       width = 12, height = 8, units = "in")